Setup the Environment

CSS Styling
writeLines("td, th { padding : 3px } th { background-color:white; color:black; border:1px solid black; text-align:center } td {color:black; border:1px solid black; word-wrap:break-word; white-space:nowrap; overflow: hidden; text-overflow: ellipsis; max-width:300px; text-align:left}", con= "../css/style.css")

Visualization of Metabolites Between Replicate Pairs

Boxplots (Original Abundances)

# Plot Boxplots faceted by shared metabolites
################################################################################
p <- plot_df %>%
  ggplot(aes(y = METABOLITE_NAME, x = VALUE, color = REPLICATE)) +
  geom_boxplot(alpha = 0.8) +
  labs(title=paste0(tissue,' ',study_institute_metab_family,": Original Metabolite Abundances"),
             x = "Abundance", y = "") 
plot(p)

Zero/Missing Values in Metabolites (Original Abundances)

# Identify Zero/Missing Values in Metabolites
################################################################################
na_df <- plot_df %>%
  ungroup() %>% 
  group_by(METABOLITE_NAME, REPLICATE) %>%
  mutate(ZERO_LOG = ifelse(VALUE == 0, 1, 0)) %>%
  mutate(ZERO_N = sum(ZERO_LOG)) %>%
  mutate(ZERO_FREQ = round(ZERO_N/length(shared_sams), digits = 2)) %>%
  mutate(NA_LOG = ifelse(is.na(VALUE), 1, 0)) %>%
  mutate(NA_N = sum(NA_LOG)) %>%
  mutate(NA_FREQ = round(NA_N/length(shared_sams), digits = 2)) %>%
  select(REPLICATE, METABOLITE_NAME, ZERO_N, ZERO_FREQ, NA_N, NA_FREQ) %>%
  unique() %>%
  arrange(REPLICATE, desc(ZERO_FREQ), desc(NA_FREQ)) %>%
  ungroup()

na_df %>%
  knitr::kable(format = "html") %>%
  scroll_box(width = "100%", height = "400px")
REPLICATE METABOLITE_NAME ZERO_N ZERO_FREQ NA_N NA_FREQ
011 Aminoisobutyric acid 78 1.00 0 0
011 3-Methylhistidine 53 0.68 0 0
011 1-Methylhistidine 26 0.33 0 0
011 beta-Alanine 6 0.08 0 0
011 Sarcosine 4 0.05 0 0
011 Aminoadipic acid 2 0.03 0 0
011 gamma-Aminobutyric acid 2 0.03 0 0
011 Alanine 0 0.00 0 0
011 Arginine 0 0.00 0 0
011 Asparagine 0 0.00 0 0
011 Aspartic acid 0 0.00 0 0
011 Citrulline 0 0.00 0 0
011 Cystathionine 0 0.00 0 0
011 Cysteine 0 0.00 0 0
011 Ethanolamine 0 0.00 0 0
011 Glutamic acid 0 0.00 0 0
011 Glutamine 0 0.00 0 0
011 Glycine 0 0.00 0 0
011 Histidine 0 0.00 0 0
011 Hydroxyproline 0 0.00 0 0
011 Isoleucine 0 0.00 0 0
011 Leucine 0 0.00 0 0
011 Lysine 0 0.00 0 0
011 Methionine 0 0.00 0 0
011 Ornithine 0 0.00 0 0
011 Phenylalanine 0 0.00 0 0
011 Phosphoethanolamine 0 0.00 0 0
011 Proline 0 0.00 0 0
011 Serine 0 0.00 0 0
011 Taurine 0 0.00 0 0
011 Threonine 0 0.00 0 0
011 Tryptophan 0 0.00 0 0
011 Tyrosine 0 0.00 0 0
011 Valine 0 0.00 0 0
011 2-Aminobutyric acid 0 0.00 0 0
012 Aminoisobutyric acid 78 1.00 0 0
012 3-Methylhistidine 56 0.72 0 0
012 1-Methylhistidine 20 0.26 0 0
012 Cystathionine 1 0.01 0 0
012 beta-Alanine 1 0.01 0 0
012 gamma-Aminobutyric acid 1 0.01 0 0
012 Alanine 0 0.00 0 0
012 Arginine 0 0.00 0 0
012 Asparagine 0 0.00 0 0
012 Aspartic acid 0 0.00 0 0
012 Citrulline 0 0.00 0 0
012 Cysteine 0 0.00 0 0
012 Ethanolamine 0 0.00 0 0
012 Glutamic acid 0 0.00 0 0
012 Glutamine 0 0.00 0 0
012 Glycine 0 0.00 0 0
012 Histidine 0 0.00 0 0
012 Hydroxyproline 0 0.00 0 0
012 Isoleucine 0 0.00 0 0
012 Leucine 0 0.00 0 0
012 Lysine 0 0.00 0 0
012 Methionine 0 0.00 0 0
012 Ornithine 0 0.00 0 0
012 Phenylalanine 0 0.00 0 0
012 Phosphoethanolamine 0 0.00 0 0
012 Proline 0 0.00 0 0
012 Sarcosine 0 0.00 0 0
012 Serine 0 0.00 0 0
012 Taurine 0 0.00 0 0
012 Threonine 0 0.00 0 0
012 Tryptophan 0 0.00 0 0
012 Tyrosine 0 0.00 0 0
012 Valine 0 0.00 0 0
012 2-Aminobutyric acid 0 0.00 0 0
012 Aminoadipic acid 0 0.00 0 0
# Remove Metabolites with high NAor Zero Frequencies
################################################################################
met_rm <- na_df %>%
  filter((ZERO_FREQ >= 0.95) | (NA_FREQ >= 0.8)) %>%
  select(METABOLITE_NAME) %>% unlist() %>% unique()
metabolites <- metabolites[metabolites %!in% met_rm]
plot_df <- plot_df %>%
  filter(METABOLITE_NAME %in% metabolites)
plot_df$METABOLITE_NAME <- as.character(plot_df$METABOLITE_NAME)

Distributions (Original Abundances)

# Summary Statistics of Density plot distribution
################################################################################
# Iterate through the runs and collect vectors
df_all <- data.frame()
for(rep in c(reps)){
  # Collect numeric vector
  num_vec <- plot_df %>%
    filter(REPLICATE == rep) %>%
    select(VALUE) %>% unlist() %>% unname()
  df <- NumericSummaryStats(num_vec)
  row.names(df) <- rep
  df_all <- rbind(df_all, df)
}
df_all %>%
  knitr::kable(format = "html") %>%
  scroll_box(width = "100%", height = "200px")
NA_COUNT NA_FREQ MEAN MEDIAN MAX MIN MID_RANGE VARIANCE STD_DEV SE_MEAN Q1 Q3 KURTOSIS SKEW BC_LAMBDA
011 0 0 0.60 0.22 11.90 0 11.90 1.22 1.10 0.02 0.04 0.67 23.54 4.21 None
012 0 0 0.58 0.23 9.71 0 9.71 1.05 1.02 0.02 0.04 0.63 19.60 3.93 None
# Plot the density plot for all the gene counts
################################################################################
plot_df %>%
  ggplot(aes(x = VALUE, color = REPLICATE)) +
  geom_density() +
  xlim(min(df_all$MIN), mean(df_all$Q3)) +
  labs(title=paste0(tissue,' ',study_institute_metab_family,": Original Metabolite Abundances"),
       x = "Abundance",
       y = "Density")

Metabolite by Metabolite Correlation (Original Abundances)

################################################################################
####################### Metabolite by Metabolite Correlations ##################
################################################################################
# Subset Matrices
x <- plot_df %>%
  unite(METABOLITE_NAME,REPLICATE, 
        col = METABOLITE_NAME_REPLICATE, remove = F) %>%
  select(METABOLITE_NAME_REPLICATE, labelid, VALUE,REPLICATE) %>%
  split(.$REPLICATE) %>%
  map(select, -REPLICATE) %>%
  map(pivot_wider, names_from = METABOLITE_NAME_REPLICATE, values_from = VALUE) %>%
  map(arrange, labelid) %>%
  map(tibble::column_to_rownames, var = "labelid") %>%
  map(as.matrix)

# Subset matrices
if(all(row.names(x[[1]]) == row.names(x[[2]]))){
  shared_met_mat <-  do.call(cbind,x) 
}

Corr <- 'Spearman'
# Create an NxN Matrix of correlation
if(Corr == 'Pearson'){
        cor1 <- stats::cor(shared_met_mat, method = 'pearson') %>% round(digits = 3)
}else if(Corr == 'Spearman'){
        cor1 <- stats::cor(shared_met_mat, method = 'spearman') %>% round(digits = 3)
}


# Reorder the correlation matrix
cormat <- cor1

# Melt the correlation matrix
melted_cormat <- melt(cormat, na.rm = TRUE)

################################################################################
####################### Only Reciprocal Pairs ##################################
################################################################################

# Remove rows that compare metabolites from the same dataset, them remove redundent measurements
melted_cormat2 <- melted_cormat 
# Widen the datafrmae back to matrix
cormat2 <- melted_cormat2 %>%
  pivot_wider(names_from = Var2, values_from = value) %>%
  as.data.frame()
row.names(cormat2) <- cormat2$Var1
cormat2 <- cormat2 %>%
  select(-Var1) %>%
  as.matrix()
cormat2 <- cormat2[sort(rownames(cormat2)),sort(colnames(cormat2))]
cormat2 <- cormat2[grepl(reps[1], rownames(cormat2)),grepl(reps[2], colnames(cormat2))]

# Orient the colors and breaks
paletteLength <- 1000
myColor <- colorRampPalette(c("blue", "white", "red"))(paletteLength)
# length(breaks) == length(paletteLength) + 1
# use floor and ceiling to deal with even/odd length pallettelengths
myBreaks <- c(seq(-1, 0, length.out=ceiling(paletteLength/2) + 1), 
              seq(1/paletteLength, max(cor1, na.rm = T), length.out=floor(paletteLength/2)))

# Plot the heatmap
pheatmap(cormat2, color=myColor, breaks=myBreaks, 
                  cluster_rows = F, cluster_cols = F,fontsize = 6)

Scatterplots (Original Abundances)

# Subset the data for sites to compare (scatterplots)
###################################################
plot_join_df <- countdata_df %>%
  ungroup() %>%
  unite(STUDY_INSTITUTE,METAB_FAMILY, col = "STUDY_INSTITUTE_METAB_FAMILY") %>%
  filter(TISSUE == tissue) %>%
  filter(NAMED == named) %>%
  filter(STUDY_INSTITUTE_METAB_FAMILY == study_institute_metab_family) %>%
  unnest(COUNT_DATA) %>%
  filter(viallabel %in% as.character(unlist(sample_join$sample_id))) %>%
  filter(METABOLITE_NAME %in% metabolites) %>%
  unite(labelid, viallabel, col = "labelid_viallabel", remove = F) %>%
  mutate(REPLICATE = case_when(grepl(paste0(reps[1],"$"),labelid_viallabel) ~ reps[1],
                              grepl(paste0(reps[2],"$"),labelid_viallabel) ~ reps[2],
                              )) %>%
  select(REPLICATE,METABOLITE_NAME,labelid,VALUE) %>%
  mutate(REPLICATE = as.character(REPLICATE)) %>%
  split(.$REPLICATE) %>%
  map(~rename(., !!sym(unique(.$REPLICATE)) := "VALUE")) %>%
  map(~select(., -REPLICATE)) %>%
  purrr::reduce(left_join, by = c("labelid","METABOLITE_NAME"))

# Plot scatter plots
################################################################################
# Plot scatter plots ordered by sample (include R2 values)
lm_df <- plot_join_df %>%
  rename(name = METABOLITE_NAME) %>%
  rename(x = reps[1]) %>%
  rename(y = reps[2])
# Iterate through each of the shared metabolites to collect summary info about their correlations
r2_df <- data.frame()
for(metab in metabolites){
  met_df <- lm_df %>%
    filter(name == metab)
  r2_df <- rbind(r2_df, lm_eqn(met_df))
}
# Display summaries
met_r2_df <- r2_df %>%
  arrange(METABOLITE) %>%
  select(METABOLITE,R2) %>%
  arrange(desc(R2))
met_r2_df %>%
  knitr::kable(format = "html") %>%
  scroll_box(width = "100%", height = "200px")
METABOLITE R2
2-Aminobutyric acid 0.624
gamma-Aminobutyric acid 0.613
Alanine 0.453
Tyrosine 0.402
Tryptophan 0.395
Leucine 0.348
Methionine 0.344
Citrulline 0.302
Glutamic acid 0.299
Ethanolamine 0.27
Isoleucine 0.265
Asparagine 0.256
Threonine 0.246
Hydroxyproline 0.234
Aspartic acid 0.227
Lysine 0.225
Valine 0.219
Glutamine 0.213
Serine 0.213
Phenylalanine 0.212
Arginine 0.206
Sarcosine 0.204
Taurine 0.201
Aminoadipic acid 0.195
Proline 0.164
Glycine 0.156
Phosphoethanolamine 0.145
Cysteine 0.136
Histidine 0.132
Cystathionine 0.126
Ornithine 0.0668
beta-Alanine 0.027
1-Methylhistidine 0.0198
3-Methylhistidine 0.014
# Collect the order of metabolites
scat_met_order <- met_r2_df %>% select(METABOLITE) %>% unlist() %>% as.character()

# Plot the scatter plots
for(metab in scat_met_order){
  p <- plot_join_df %>%
    filter(METABOLITE_NAME == metab) %>%
  ggplot(aes(x = !!sym(reps[1]), y = !!sym(reps[2])), color = ) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", size = 1) +
  geom_abline(linetype="dashed") +
  facet_wrap(~ METABOLITE_NAME) +
  expand_limits(x = 0, y = 0) +
  #labs(title=paste0(tissue,' ',study_institute_metab_family,": Normalized Metabolite Abundances")) +
  coord_fixed(ratio=1)
  plot(p)
}

Sample by Sample Correlations (Original Abundances)

################################################################################
####################### Sample by Sample Correlations ##########################
################################################################################

# Subset Data
################################################################################
# Original NxP Matrix
x <- plot_df %>%
  unite(labelid,REPLICATE, 
        col = labelid_REPLICATE, remove = F) %>%
  select(labelid_REPLICATE, METABOLITE_NAME, VALUE,REPLICATE) %>%
  split(.$REPLICATE) %>%
  map(select, -REPLICATE) %>%
  map(pivot_wider, names_from = METABOLITE_NAME, values_from = VALUE) %>%
  map(tibble::column_to_rownames, var = "labelid_REPLICATE") %>%
  map(as.matrix)
# Subset matrices
shared_sam_mat <-  do.call(rbind,x) %>% 
  t()

# Create an NxN Matrix of correlation
Corr <- 'Spearman'
if(Corr == 'Pearson'){
        cor1 <- stats::cor(shared_sam_mat, method = 'pearson') %>% round(digits = 3)
}else if(Corr == 'Spearman'){
        cor1 <- stats::cor(shared_sam_mat, method = 'spearman') %>% round(digits = 3)
}

# Melt the correlation matrix
melted_cormat <- melt(cor1, na.rm = TRUE)

################################################################################
####################### Unclustered ############################################
################################################################################

# Remove rows that compare metabolites from the same dataset, them remove redundent measurements
melted_cormat2 <- melted_cormat

# Widen the dataframwe back to matrix
cormat2 <- melted_cormat2 %>%
  pivot_wider(names_from = Var2, values_from = value) %>%
  as.data.frame()
row.names(cormat2) <- cormat2$Var1
cormat2 <- cormat2 %>%
  select(-Var1) %>%
  as.matrix()
cormat2 <- cormat2[sort(rownames(cormat2)),sort(colnames(cormat2))]
cormat2 <- cormat2[grepl(reps[1], rownames(cormat2)),grepl(reps[2], colnames(cormat2))]

# Orient the colors and breaks
paletteLength <- 1000
myColor <- colorRampPalette(c("blue", "white", "red"))(paletteLength)
# length(breaks) == length(paletteLength) + 1
# use floor and ceiling to deal with even/odd length pallettelengths
myBreaks <- c(seq(-1, 0, length.out=ceiling(paletteLength/2) + 1), 
              seq(1/paletteLength, max(cor1, na.rm = TRUE), length.out=floor(paletteLength/2)))

# Plot the heatmap
pheatmap(as.matrix(cormat2), color=myColor, breaks=myBreaks, 
                  cluster_rows = F, cluster_cols = F,fontsize = 6)

PCA Analysis (Original Abundances)

pca <- prcomp(na.omit(t(shared_sam_mat)), scale. = F)
percentVar <- pca$sdev^2/sum(pca$sdev^2)
PC <- pca$x %>% as.data.frame()
PC$labelid_rep <- as.character(row.names(pca$x))
PC <- PC %>%
  tidyr::separate(col = labelid_rep, into = c('labelid', 'REPLICATE'),
  sep = "_", remove = T)
# Join the phenotype data
PC <- left_join(PC, pheno_df, by = 'labelid')

p0 <- PC %>%
  ggplot(aes(x = PC1, y = PC2, color=REPLICATE)) +
  geom_point(size = 3) +
  geom_line(aes(group = labelid),color="dark grey") +
  ggtitle('Shared Samples and Shared Metabolites') + 
  xlab(paste0('PC1: ',round(percentVar[1]*100,2),'% variance')) +
  ylab(paste0('PC2: ',round(percentVar[2]*100,2),'% variance')) +
  theme(aspect.ratio=1)
p0

p1 <- PC %>%
        ggplot(aes(x = PC1, y = PC2, color=Key.anirandgroup, shape = REPLICATE)) +
        geom_point(size = 3) +
  geom_line(aes(group = labelid),color="dark grey") +
        ggtitle('Shared Samples and Shared Metabolites') + 
        xlab(paste0('PC1: ',round(percentVar[1]*100,2),'% variance')) +
        ylab(paste0('PC2: ',round(percentVar[2]*100,2),'% variance')) +
        scale_color_manual(values=ec_colors) +
        theme(aspect.ratio=1)
p1

PC$Registration.sex <- as.character(PC$Registration.sex)
p2 <- PC %>%
        ggplot(aes(x = PC1, y = PC2, color=Registration.sex, shape = REPLICATE)) +
        geom_point(size = 3) +
  geom_line(aes(group = labelid),color="dark grey") +
        ggtitle('Shared Samples and Shared Metabolites') + 
        xlab(paste0('PC1: ',round(percentVar[1]*100,2),'% variance')) +
        ylab(paste0('PC2: ',round(percentVar[2]*100,2),'% variance')) +
        theme(aspect.ratio=1)
p2

Normalization (AutoScale)

# Normalize
# Note in this step, we use labelid to account for mayo's duplicate samples
################################################################################
norm_df <- plot_df %>%
  unite(labelid, viallabel, col = "labelid_viallabel", remove = F) %>%
  split(.$REPLICATE) %>%
  map(select, labelid_viallabel, METABOLITE_NAME, VALUE) %>%
  map(pivot_wider, names_from = METABOLITE_NAME, values_from = VALUE) %>%
  map(column_to_rownames, var = "labelid_viallabel") %>%
  map(as.matrix) %>%
  map(AutoScaleMatrix) %>%
  map(as.data.frame) %>%
  map(rownames_to_column, var = "labelid_viallabel") %>%
  map(pivot_longer, names_to = 'METABOLITE_NAME', values_to = 'VALUE', cols = all_of(metabolites)) %>%
  map(mutate, REPLICATE = case_when(grepl(paste0(reps[1],"$"),labelid_viallabel) ~ reps[1],
                                     grepl(paste0(reps[2],"$"),labelid_viallabel) ~ reps[2],
                                    grepl(paste0(reps[3],"$"),labelid_viallabel) ~ reps[3])) %>%
  bind_rows()

Boxplots (Normalized Abundances)

# Plot Boxplots faceted by shared metabolites
################################################################################
norm_df %>%
  ggplot(aes(y = METABOLITE_NAME, x = VALUE, color = REPLICATE)) +
  geom_boxplot(alpha = 0.8) +
  labs(title=paste0(tissue,' ',study_institute_metab_family,": Normalized Metabolite Abundances"),
             x = "Abundance", y = "") 

Metabolite by Metabolite Correlation (Normalized Abundances)

################################################################################
####################### Metabolite by Metabolite Correlations ##################
################################################################################
# Subset Matrices
x <- plot_df %>%
  unite(METABOLITE_NAME,REPLICATE, 
        col = METABOLITE_NAME_REPLICATE, remove = F) %>%
  select(METABOLITE_NAME_REPLICATE, labelid, VALUE,REPLICATE) %>%
  split(.$REPLICATE) %>%
  map(select, -REPLICATE) %>%
  map(pivot_wider, names_from = METABOLITE_NAME_REPLICATE, values_from = VALUE) %>%
  map(arrange, labelid) %>%
  map(tibble::column_to_rownames, var = "labelid") %>%
  map(as.matrix) %>%
  map(AutoScaleMatrix)

# Subset matrices
if(all(row.names(x[[1]]) == row.names(x[[2]]))){
  shared_met_mat <-  do.call(cbind,x) 
}

Corr <- 'Spearman'
# Create an NxN Matrix of correlation
if(Corr == 'Pearson'){
        cor1 <- stats::cor(shared_met_mat, method = 'pearson') %>% round(digits = 3)
}else if(Corr == 'Spearman'){
        cor1 <- stats::cor(shared_met_mat, method = 'spearman') %>% round(digits = 3)
}

# Reorder the correlation matrix
cormat <- cor1

# Melt the correlation matrix
melted_cormat <- melt(cormat, na.rm = TRUE)

################################################################################
####################### Only Reciprocal Pairs ##################################
################################################################################

# Remove rows that compare metabolites from the same dataset, them remove redundent measurements
melted_cormat2 <- melted_cormat 
# Widen the datafrmae back to matrix
cormat2 <- melted_cormat2 %>%
  pivot_wider(names_from = Var2, values_from = value) %>%
  as.data.frame()
row.names(cormat2) <- cormat2$Var1
cormat2 <- cormat2 %>%
  select(-Var1) %>%
  as.matrix()
cormat2 <- cormat2[sort(rownames(cormat2)),sort(colnames(cormat2))]
cormat2 <- cormat2[grepl(reps[1], rownames(cormat2)),grepl(reps[2], colnames(cormat2))]

# Orient the colors and breaks
paletteLength <- 1000
myColor <- colorRampPalette(c("blue", "white", "red"))(paletteLength)
# length(breaks) == length(paletteLength) + 1
# use floor and ceiling to deal with even/odd length pallettelengths
myBreaks <- c(seq(-1, 0, length.out=ceiling(paletteLength/2) + 1), 
              seq(1/paletteLength, max(cor1, na.rm = T), length.out=floor(paletteLength/2)))

# Plot the heatmap
pheatmap(cormat2, color=myColor, breaks=myBreaks, 
                  cluster_rows = F, cluster_cols = F,fontsize = 6)

Scatterplots (Normalized Abundances)

# Normalize
# Subset the data for sites to compare (scatterplots)
###################################################
norm_join_df <- countdata_df %>%
  ungroup() %>%
  unite(STUDY_INSTITUTE,METAB_FAMILY, col = "STUDY_INSTITUTE_METAB_FAMILY") %>%
  filter(TISSUE == tissue) %>%
  filter(NAMED == named) %>%
  filter(STUDY_INSTITUTE_METAB_FAMILY == study_institute_metab_family) %>%
  unnest(COUNT_DATA) %>%
  filter(viallabel %in% as.character(unlist(sample_join$sample_id))) %>%
  unite(labelid, viallabel, col = "labelid_viallabel", remove = F) %>%
  mutate(REPLICATE = case_when(grepl(paste0(reps[1],"$"),labelid_viallabel) ~ reps[1],
                                     grepl(paste0(reps[2],"$"),labelid_viallabel) ~ reps[2],
                               grepl(paste0(reps[3],"$"),labelid_viallabel) ~ reps[3])) %>%
  split(.$REPLICATE) %>%
  map(select, labelid_viallabel, METABOLITE_NAME, VALUE) %>%
  map(pivot_wider, names_from = METABOLITE_NAME, values_from = VALUE) %>%
  map(column_to_rownames, var = "labelid_viallabel") %>%
  map(as.matrix) %>%
  map(AutoScaleMatrix) %>%
  map(as.data.frame) %>%
  map(rownames_to_column, var = "labelid_viallabel") %>%
  map(pivot_longer, names_to = 'METABOLITE_NAME', values_to = 'VALUE', cols = all_of(metabolites)) %>%
  map(mutate, REPLICATE = case_when(grepl(paste0(reps[1],"$"),labelid_viallabel) ~ reps[1],
                                     grepl(paste0(reps[2],"$"),labelid_viallabel) ~ reps[2],
                                    grepl(paste0(reps[3],"$"),labelid_viallabel) ~ reps[3])) %>%
  map(~rename(., !!sym(unique(.$REPLICATE)) := "VALUE")) %>%
  map(~select(., -REPLICATE)) %>%
  map(mutate, labelid = gsub(pattern = "_..*", replacement = '', labelid_viallabel)) %>%
  map(select, -labelid_viallabel) %>%
  purrr::reduce(left_join, by = c("labelid","METABOLITE_NAME"))

# Plot scatter plots ordered by sample (include R2 values)
lm_df <- norm_join_df %>%
  rename(name = METABOLITE_NAME) %>%
  rename(x = reps[1]) %>%
  rename(y = reps[2])
# Iterate through each of the shared metabolites to collect summary info about their correlations
r2_df <- data.frame()
for(metab in metabolites){
  met_df <- lm_df %>%
    filter(name == metab)
  r2_df <- rbind(r2_df, lm_eqn(met_df))
}
# Display summaries
met_r2_df <- r2_df %>%
  arrange(METABOLITE) %>%
  select(METABOLITE,R2) %>%
  arrange(desc(R2))
met_r2_df %>%
  knitr::kable(format = "html") %>%
  scroll_box(width = "100%", height = "200px")
METABOLITE R2
2-Aminobutyric acid 0.624
gamma-Aminobutyric acid 0.613
Alanine 0.453
Tyrosine 0.402
Tryptophan 0.395
Leucine 0.348
Methionine 0.344
Citrulline 0.302
Glutamic acid 0.299
Ethanolamine 0.27
Isoleucine 0.265
Asparagine 0.256
Threonine 0.246
Hydroxyproline 0.234
Aspartic acid 0.227
Lysine 0.225
Valine 0.219
Glutamine 0.213
Serine 0.213
Phenylalanine 0.212
Arginine 0.206
Sarcosine 0.204
Taurine 0.201
Aminoadipic acid 0.195
Proline 0.164
Glycine 0.156
Phosphoethanolamine 0.145
Cysteine 0.136
Histidine 0.132
Cystathionine 0.126
Ornithine 0.0668
beta-Alanine 0.027
1-Methylhistidine 0.0198
3-Methylhistidine 0.014
# Collect the order of metabolites
scat_met_order <- met_r2_df %>% select(METABOLITE) %>% unlist() %>% as.character()

# Plot scatter plots (Normalized)
################################################################################
# Plot the scatter plots
for(metab in scat_met_order){
  p <- norm_join_df %>%
    filter(METABOLITE_NAME == metab) %>%
  ggplot(aes(x = !!sym(reps[1]), y = !!sym(reps[2])), color = ) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", size = 1) +
  geom_abline(linetype="dashed") +
  facet_wrap(~ METABOLITE_NAME) +
  expand_limits(x = 0, y = 0) +
  #labs(title=paste0(tissue,' ',study_institute_metab_family,": Normalized Metabolite Abundances")) +
  coord_fixed(ratio=1)
  plot(p)
}

# norm_join_df %>%
#   mutate(METABOLITE_NAME = factor(METABOLITE_NAME, levels = scat_met_order)) %>%
#   ggplot(aes(x = !!sym(reps[1]), y = !!sym(reps[2])), color = ) +
#   geom_point(alpha = 0.5) +
#   geom_smooth(method = "lm", size = 1) +
#   geom_abline(linetype="dashed") +
#   facet_wrap(~ METABOLITE_NAME) +
#   expand_limits(x = 0, y = 0) +
#   labs(title=paste0(tissue,' ',study_institute_metab_family,": Normalized Metabolite Abundances")) +
#   coord_fixed(ratio=1)

Distributions (Normalized Abundances)

# Plot the density plot for all the gene counts
################################################################################
norm_df %>%
  ggplot(aes(x = VALUE, color = REPLICATE)) +
  geom_density() +
  labs(title=paste0(tissue,' ',study_institute_metab_family,": Normalized Metabolite Abundances"),
       x = "Abundance",
       y = "Density")

# Summary Statistics of Density plot distribution
################################################################################
# Iterate through the runs and collect vectors
df_all <- data.frame()
for(rep in reps){
  # Collect numeric vector
  num_vec <- norm_df %>%
    filter(REPLICATE == rep) %>%
    select(VALUE) %>% unlist() %>% unname()
  df <- NumericSummaryStats(num_vec)
  row.names(df) <- rep
  df_all <- rbind(df_all, df)
}
df_all
##     NA_COUNT NA_FREQ MEAN MEDIAN  MAX   MIN MID_RANGE VARIANCE STD_DEV SE_MEAN    Q1   Q3 KURTOSIS SKEW
## 011        0       0    0  -0.14 8.72 -1.88     10.60     0.99    0.99    0.02 -0.71 0.60     3.05 1.04
## 012        0       0    0  -0.23 6.40 -1.78      8.18     0.99    0.99    0.02 -0.69 0.57     2.08 1.08
##     BC_LAMBDA
## 011      None
## 012      None
#}

Sample by Sample Correlations (Normalized Abundances)

################################################################################
####################### Sample by Sample Correlations ##########################
################################################################################

# Subset Data
################################################################################
# Original NxP Matrix
x <- plot_df %>%
  unite(labelid,REPLICATE, 
        col = labelid_REPLICATE, remove = F) %>%
  select(labelid_REPLICATE, METABOLITE_NAME, VALUE,REPLICATE) %>%
  split(.$REPLICATE) %>%
  map(select, -REPLICATE) %>%
  map(pivot_wider, names_from = METABOLITE_NAME, values_from = VALUE) %>%
  map(tibble::column_to_rownames, var = "labelid_REPLICATE") %>%
  map(as.matrix) %>%
  map(AutoScaleMatrix)
# Subset matrices
shared_sam_mat <-  do.call(rbind,x) %>% 
  t()

# Create an NxN Matrix of correlation
Corr <- 'Spearman'
if(Corr == 'Pearson'){
        cor1 <- stats::cor(shared_sam_mat, method = 'pearson') %>% round(digits = 3)
}else if(Corr == 'Spearman'){
        cor1 <- stats::cor(shared_sam_mat, method = 'spearman') %>% round(digits = 3)
}

# Melt the correlation matrix
melted_cormat <- melt(cor1, na.rm = TRUE)

################################################################################
####################### Unclustered ############################################
################################################################################

# Remove rows that compare metabolites from the same dataset, them remove redundent measurements
melted_cormat2 <- melted_cormat

# Widen the dataframwe back to matrix
cormat2 <- melted_cormat2 %>%
  pivot_wider(names_from = Var2, values_from = value) %>%
  as.data.frame()
row.names(cormat2) <- cormat2$Var1
cormat2 <- cormat2 %>%
  select(-Var1) %>%
  as.matrix()
cormat2 <- cormat2[sort(rownames(cormat2)),sort(colnames(cormat2))]
cormat2 <- cormat2[grepl(reps[1], rownames(cormat2)),grepl(reps[2], colnames(cormat2))]

# Orient the colors and breaks
paletteLength <- 1000
myColor <- colorRampPalette(c("blue", "white", "red"))(paletteLength)
# length(breaks) == length(paletteLength) + 1
# use floor and ceiling to deal with even/odd length pallettelengths
myBreaks <- c(seq(-1, 0, length.out=ceiling(paletteLength/2) + 1), 
              seq(1/paletteLength, max(cor1, na.rm = TRUE), length.out=floor(paletteLength/2)))

# Plot the heatmap
pheatmap(as.matrix(cormat2), color=myColor, breaks=myBreaks, 
                  cluster_rows = F, cluster_cols = F,fontsize = 6)

PCA Analysis (Normalized Abundances)

# Plot a PCA with outliers labeled
# shared_sam_mat <- shared_sam_mat %>%
#   t() %>% AutoScaleMatrix() %>%
#   t()
pca <- prcomp(na.omit(t(shared_sam_mat)), scale. = F)
percentVar <- pca$sdev^2/sum(pca$sdev^2)
PC <- pca$x %>% as.data.frame()
PC$labelid_rep <- as.character(row.names(pca$x))
PC <- PC %>%
  tidyr::separate(col = labelid_rep, into = c('labelid', 'REPLICATE'),
  sep = "_", remove = T)
# Join the phenotype data
PC <- left_join(PC, pheno_df, by = 'labelid')

p0 <- PC %>%
  ggplot(aes(x = PC1, y = PC2, color=REPLICATE)) +
  geom_point(size = 3) +
  geom_line(aes(group = labelid),color="dark grey") +
  ggtitle('Shared Samples and Shared Metabolites') + 
  xlab(paste0('PC1: ',round(percentVar[1]*100,2),'% variance')) +
  ylab(paste0('PC2: ',round(percentVar[2]*100,2),'% variance')) +
  theme(aspect.ratio=1)
p0

p1 <- PC %>%
        ggplot(aes(x = PC1, y = PC2, color=Key.anirandgroup, shape = REPLICATE)) +
        geom_point(size = 3) +
  geom_line(aes(group = labelid),color="dark grey") +
        ggtitle('Shared Samples and Shared Metabolites') + 
        xlab(paste0('PC1: ',round(percentVar[1]*100,2),'% variance')) +
        ylab(paste0('PC2: ',round(percentVar[2]*100,2),'% variance')) +
        scale_color_manual(values=ec_colors) +
        theme(aspect.ratio=1)
p1

PC$Registration.sex <- as.character(PC$Registration.sex)
p2 <- PC %>%
        ggplot(aes(x = PC1, y = PC2, color=Registration.sex, shape = REPLICATE)) +
        geom_point(size = 3) +
  geom_line(aes(group = labelid),color="dark grey") +
        ggtitle('Shared Samples and Shared Metabolites') + 
        xlab(paste0('PC1: ',round(percentVar[1]*100,2),'% variance')) +
        ylab(paste0('PC2: ',round(percentVar[2]*100,2),'% variance')) +
        theme(aspect.ratio=1)
p2